class: center, middle, inverse, title-slide .title[ # Databases with R (part 2) ] .subtitle[ ##
https://pos.it/databases-24
] .author[ ### Kirill Müller ] .author[ ### cynkra GmbH ] .date[ ### August 12, 2024 ] --- <style type="text/css"> .pull-left { margin-top: -25px; } .pull-right { margin-top: -25px; } .r.hljs.remark-code { font-size: 20px; } .remark-code { font-size: 20px; } .fansi.fansi-error { color: #D70040; } pre { white-space: pre-wrap; /* css-3 */ white-space: -moz-pre-wrap; /* Mozilla, since 1999 */ white-space: -pre-wrap; /* Opera 4-6 */ white-space: -o-pre-wrap; /* Opera 7 */ word-wrap: break-word; /* Internet Explorer 5.5+ */ margin-bottom: 0px; } .reg-margin-bottom { margin-bottom: 25px; } .remark-slide-content { padding-top: 0px; padding-bottom: 0px; } .remark-slide-scaler { overflow-y: auto; } .font17 { font-size: 17px; } .font14 { font-size: 14px; } .script-number-font { color: inherit !important; opacity: 0.9 !important; transform: scale(1.8); transform-origin: top left; } .script-number { color: inherit !important; opacity: 0.9 !important; position: absolute; top: 12px; right: 20px; transform: scale(1.8); transform-origin: top right; } .exercise-timer { position: fixed; bottom: 12px; left: 20px; transform: scale(1.8); transform-origin: bottom left; } .exercise-timer img { max-width: none; /* Prevent the image from being constrained by the container */ } </style> # Learning goals <img src="data:image/png;base64,#databases-2_files/figure-html/fig-learn-duckdb-1.png" style="display: block; margin: auto;" /> --- class: middle <img src="data:image/png;base64,#images/triangle.svg" width="70%" height="70%" style="display: block; margin: auto;" /> --- background-image: url("data:image/png;base64,#images/21.webp") background-size: contain background-position: 100% 100% # Working with database dumps .pull-left[ - Single file - Partitioned dataset - Parquet - CSV <div class="script-number-font"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> ```r library(DBI) library(dplyr) requireNamespace("duckdb") requireNamespace("duckplyr") ``` ] --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: contain background-position: 100% 100% # Working with database dumps .pull-left[ - Single file - Partitioned dataset - Parquet - CSV <div class="script-number-font"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> ```r library(DBI) library(dplyr) requireNamespace("duckdb") requireNamespace("duckplyr") ``` ] --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Prepare dump Imagine someone gave you a Parquet file. ``` r arrow::write_parquet(nycflights13::flights, "flights.parquet") fs::file_size("flights.parquet") ``` ``` 5.43M ``` ``` r lobstr::obj_size(nycflights13::flights) ``` ``` 40.65 MB ``` <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Processing the local data ``` r df <- arrow::read_parquet("flights.parquet") df ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 336,776 × 19</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_t…¹</span> <span style='font-weight: bold;'>sched…²</span> <span style='font-weight: bold;'>dep_d…³</span> <span style='font-weight: bold;'>arr_t…⁴</span> <span style='font-weight: bold;'>sched…⁵</span> <span style='font-weight: bold;'>arr_d…⁶</span> <span style='font-weight: bold;'>carrier</span> <span style='font-weight: bold;'>flight</span> <span style='font-weight: bold;'>tailnum</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 517 515 2 830 819 11 UA <span style='text-decoration: underline;'>1</span>545 N14228 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 1 533 529 4 850 830 20 UA <span style='text-decoration: underline;'>1</span>714 N24211 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 1 542 540 2 923 850 33 AA <span style='text-decoration: underline;'>1</span>141 N619AA <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 1 544 545 -<span style='color: #BB0000;'>1</span> <span style='text-decoration: underline;'>1</span>004 <span style='text-decoration: underline;'>1</span>022 -<span style='color: #BB0000;'>18</span> B6 725 N804JB <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 600 -<span style='color: #BB0000;'>6</span> 812 837 -<span style='color: #BB0000;'>25</span> DL 461 N668DN <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 558 -<span style='color: #BB0000;'>4</span> 740 728 12 UA <span style='text-decoration: underline;'>1</span>696 N39463 <span style='color: #949494;'># ℹ 336,770 more rows</span> <span style='color: #949494;'># ℹ abbreviated names: ¹dep_time, ²sched_dep_time, ³dep_delay, ⁴arr_time, …</span> </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Read as Arrow dataset .pull-left[ ``` r ds <- arrow::open_dataset("flights.parquet") ds ``` ``` FileSystemDataset with 1 Parquet file 19 columns year: int32 month: int32 day: int32 dep_time: int32 sched_dep_time: int32 dep_delay: double arr_time: int32 sched_arr_time: int32 arr_delay: double carrier: string flight: int32 tailnum: string origin: string dest: string air_time: double distance: double hour: double minute: double time_hour: timestamp[us] ``` ] .pull-right[ ``` r ds |> count(year, month, day) |> collect() ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 365 × 4</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ 359 more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Register as duckdb lazy table .pull-left[ ``` r con_memory <- dbConnect( duckdb::duckdb(), dbdir = ":memory:") tbl <- duckdb::tbl_file( con_memory, "flights.parquet") tbl |> explain() ``` ``` <SQL> FROM 'flights.parquet' <PLAN> physical_plan ┌───────────────────────────┐ │ PARQUET_SCAN │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ dep_time │ │ sched_dep_time │ │ dep_delay │ │ arr_time │ │ sched_arr_time │ │ arr_delay │ │ carrier │ │ flight │ │ tailnum │ │ origin │ │ dest │ │ air_time │ │ distance │ │ hour │ │ minute │ │ time_hour │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 336776 │ └───────────────────────────┘ ``` ] .pull-right[ ``` r tbl |> count(year, month, day) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 4]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Register as duckplyr lazy data frame .reg-margin-bottom[ ``` r duckplyr_df <- duckplyr::duckplyr_df_from_parquet("flights.parquet") class(duckplyr_df) ``` ``` [1] "duckplyr_df" "tbl_df" "tbl" "data.frame" ``` ] .pull-left[ ``` r duckplyr_df |> explain() ``` ``` ┌───────────────────────────┐ │ READ_PARQUET │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ dep_time │ │ sched_dep_time │ │ dep_delay │ │ arr_time │ │ sched_arr_time │ │ arr_delay │ │ carrier │ │ flight │ │ tailnum │ │ origin │ │ dest │ │ air_time │ │ distance │ │ hour │ │ minute │ │ time_hour │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 336776 │ └───────────────────────────┘ ``` ] .pull-right[ ``` r duckplyr_df ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 336,776 × 19</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_time</span> <span style='font-weight: bold;'>sched_d…¹</span> <span style='font-weight: bold;'>dep_d…²</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 517 515 2 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 1 533 529 4 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 1 542 540 2 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 1 544 545 -<span style='color: #BB0000;'>1</span> <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 600 -<span style='color: #BB0000;'>6</span> <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 558 -<span style='color: #BB0000;'>4</span> <span style='color: #949494;'># ℹ 336,770 more rows</span> <span style='color: #949494;'># ℹ abbreviated names: ¹sched_dep_time, …</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # The future: Register as duckplyr lazy data frame .pull-left[ ``` r duckplyr_df |> count(year, month, day) |> filter(month == 1) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 31 × 4</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ 25 more rows</span> </CODE></PRE> ] .pull-right[ ``` r duckplyr_df |> count(year, month, day) |> filter(month == 1) |> explain() ``` ``` ┌───────────────────────────┐ │ ORDER_BY │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ ORDERS: │ │dataframe_105553178666960_7│ │ 62464429."year" ASC │ │dataframe_105553178666960_7│ │ 62464429."month" ASC │ │dataframe_105553178666960_7│ │ 62464429."day" ASC │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ PROJECTION │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ n │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ HASH_GROUP_BY │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ #0 │ │ #1 │ │ #2 │ │ count_star() │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ PROJECTION │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ FILTER │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ r_base::==(month, 1.0) │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 67355 │ └─────────────┬─────────────┘ ┌─────────────┴─────────────┐ │ R_DATAFRAME_SCAN │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ data.frame │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 336776 │ └───────────────────────────┘ ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 450px # Prepare partitioned dump .pull-left[ Imagine someone gave you a bunch of Parquet files. ``` r arrow::write_dataset( nycflights13::flights, "flights-part/", partitioning = c("year", "month") ) fs::dir_tree("flights-part") ``` ] .pull-right[ <PRE class="fansi fansi-output"><CODE><span style='color: #00BBBB; font-weight: bold;'>flights-part</span> └── <span style='color: #00BBBB; font-weight: bold;'>year=2013</span> ├── <span style='color: #00BBBB; font-weight: bold;'>month=1</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=10</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=11</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=12</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=2</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=3</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=4</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=5</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=6</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=7</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=8</span> │ └── part-0.parquet └── <span style='color: #00BBBB; font-weight: bold;'>month=9</span> └── part-0.parquet </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Read partitioned data ``` r tbl_part <- duckdb::tbl_query( con_memory, "read_parquet('flights-part/*/*/*.parquet', hive_partitioning = true)" ) tbl_part ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 19]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_time</span> <span style='font-weight: bold;'>sched_dep…¹</span> <span style='font-weight: bold;'>dep_d…²</span> <span style='font-weight: bold;'>arr_t…³</span> <span style='font-weight: bold;'>sched…⁴</span> <span style='font-weight: bold;'>arr_d…⁵</span> <span style='font-weight: bold;'>carrier</span> <span style='font-weight: bold;'>flight</span> <span style='font-weight: bold;'>tailnum</span> <span style='font-weight: bold;'>origin</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #BCBCBC;'>1</span> 1 517 515 2 830 819 11 UA <span style='text-decoration: underline;'>1</span>545 N14228 EWR <span style='color: #BCBCBC;'>2</span> 1 533 529 4 850 830 20 UA <span style='text-decoration: underline;'>1</span>714 N24211 LGA <span style='color: #BCBCBC;'>3</span> 1 542 540 2 923 850 33 AA <span style='text-decoration: underline;'>1</span>141 N619AA JFK <span style='color: #BCBCBC;'>4</span> 1 544 545 -<span style='color: #BB0000;'>1</span> <span style='text-decoration: underline;'>1</span>004 <span style='text-decoration: underline;'>1</span>022 -<span style='color: #BB0000;'>18</span> B6 725 N804JB JFK <span style='color: #BCBCBC;'>5</span> 1 554 600 -<span style='color: #BB0000;'>6</span> 812 837 -<span style='color: #BB0000;'>25</span> DL 461 N668DN LGA <span style='color: #BCBCBC;'>6</span> 1 554 558 -<span style='color: #BB0000;'>4</span> 740 728 12 UA <span style='text-decoration: underline;'>1</span>696 N39463 EWR <span style='color: #949494;'># ℹ more rows</span> <span style='color: #949494;'># ℹ abbreviated names: ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, …</span> </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Prepare CSV dump Imagine someone gave you a CSV file. ``` r readr::write_csv(nycflights13::flights, "flights.csv") fs::file_size("flights.csv") ``` ``` 29.6M ``` ``` r lobstr::obj_size(nycflights13::flights) ``` ``` 40.65 MB ``` <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Read CSV data with duckdb .pull-left[ ``` r tbl_csv <- duckdb::tbl_file( con_memory, "flights.csv") tbl_csv |> explain() ``` ``` <SQL> FROM 'flights.csv' <PLAN> physical_plan ┌───────────────────────────┐ │ READ_CSV_AUTO │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ dep_time │ │ sched_dep_time │ │ dep_delay │ │ arr_time │ │ sched_arr_time │ │ arr_delay │ │ carrier │ │ flight │ │ tailnum │ │ origin │ │ dest │ │ air_time │ │ distance │ │ hour │ │ minute │ │ time_hour │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 326882 │ └───────────────────────────┘ ``` ] .pull-right[ ``` r tbl_csv |> count(year, month, day) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 4]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 7 11 <span style='text-decoration: underline;'>1</span>006 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 7 15 999 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 7 16 996 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 7 22 <span style='text-decoration: underline;'>1</span>000 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 7 26 999 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 7 31 <span style='text-decoration: underline;'>1</span>001 <span style='color: #949494;'># ℹ more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Read CSV data with duckplyr .pull-left[ ``` r duckplyr_df_csv <- duckplyr::duckplyr_df_from_csv("flights.csv") class(duckplyr_df_csv) ``` ``` [1] "duckplyr_df" "tbl_df" "tbl" [4] "data.frame" ``` ``` r duckplyr_df_csv |> explain() ``` ``` ┌───────────────────────────┐ │ READ_CSV_AUTO │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ year │ │ month │ │ day │ │ dep_time │ │ sched_dep_time │ │ dep_delay │ │ arr_time │ │ sched_arr_time │ │ arr_delay │ │ carrier │ │ flight │ │ tailnum │ │ origin │ │ dest │ │ air_time │ │ distance │ │ hour │ │ minute │ │ time_hour │ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ │ EC: 326882 │ └───────────────────────────┘ ``` ] .pull-right[ ``` r duckplyr_df_csv |> count(year, month, day) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 365 × 4</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ 359 more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- # Create derived Parquet data with duckplyr .pull-left[ ``` r duckplyr_df_csv |> count(year, month, day) |> duckplyr::df_to_parquet( "flights-count.parquet") fs::file_size("flights-count.parquet") ``` ``` 2.13K ``` ] .pull-right[ ``` r duckplyr::duckplyr_df_from_parquet( "flights-count.parquet") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 365 × 4</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 842 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 943 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 914 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 915 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 720 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 832 <span style='color: #949494;'># ℹ 359 more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: contain background-position: 100% 100% .pull-left[ # Working with database dumps: Exercises 1. From the Parquet file, compute two lazy dbplyr tables showing the mean and median delay: - for each day of the week - for each month 2. Compute the same data as duckplyr lazy data frames. 3. Store this data as a Parquet file. 4. Read the Parquet file and plot the data. ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_21.R"><title>Script: db_21.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_21.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_21.R</text></g></svg></div> <div class="exercise-timer"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="84" height="20" role="img" aria-label="Timer: 10:00"><title>Timer: 10:00</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="84" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="41" height="20" fill="#555"/><rect x="41" width="43" height="20" fill="#007ec6"/><rect width="84" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="215" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="310">Timer</text><text x="215" y="140" transform="scale(.1)" fill="#fff" textLength="310">Timer</text><text aria-hidden="true" x="615" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">10:00</text><text x="615" y="140" transform="scale(.1)" fill="#fff" textLength="330">10:00</text></g></svg></div> --- # DuckDB showcase .middle[ <img src="data:image/png;base64,#images/hub.svg" width="130%" height="130%" style="display: block; margin: auto;" /> ] --- # Application: DBI + dbplyr and pivoting .pull-left[ ``` r con_memory <- dbConnect( duckdb::duckdb(), dbdir = ":memory:") tbl <- duckdb::tbl_file( con_memory, "flights.parquet") daily_flights_by_dest <- tbl |> count(year, month, day, dest) ``` ] .pull-right[ ``` r daily_flights_by_dest ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 5]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dest</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 6 30 PHX 13 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 6 30 LGB 2 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 6 30 PWM 7 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 6 30 AUS 7 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 6 30 SMF 1 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 6 30 MCI 3 <span style='color: #949494;'># ℹ more rows</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting .pull-left[ ``` r daily_flights_by_dest_sql <- daily_flights_by_dest |> dbplyr::sql_render() daily_flights_by_dest_sql ``` ``` <SQL> SELECT "year", "month", "day", dest, COUNT(*) AS n FROM (FROM 'flights.parquet') q01 GROUP BY "year", "month", "day", dest ``` ] .pull-right[ ``` <SQL> PIVOT (SELECT "year", "month", "day", dest, COUNT(*) AS n FROM (FROM 'flights.parquet') q01 GROUP BY "year", "month", "day", dest) ON dest USING SUM(n) ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting ``` r as_tibble(dbGetQuery(con_memory, pivot_sql)) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># A tibble: 365 × 108</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>ABQ</span> <span style='font-weight: bold;'>ACK</span> <span style='font-weight: bold;'>ALB</span> <span style='font-weight: bold;'>ANC</span> <span style='font-weight: bold;'>ATL</span> <span style='font-weight: bold;'>AUS</span> <span style='font-weight: bold;'>AVL</span> <span style='font-weight: bold;'>BDL</span> <span style='font-weight: bold;'>BGR</span> <span style='font-weight: bold;'>BHM</span> <span style='font-weight: bold;'>BNA</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 3 <span style='color: #BB0000;'>NA</span> 40 6 1 2 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 10 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 2 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 3 <span style='color: #BB0000;'>NA</span> 51 7 1 2 <span style='color: #BB0000;'>NA</span> 1 11 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 3 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> 49 6 <span style='color: #BB0000;'>NA</span> 1 <span style='color: #BB0000;'>NA</span> 1 11 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 4 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> 48 6 <span style='color: #BB0000;'>NA</span> 1 <span style='color: #BB0000;'>NA</span> 1 11 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 5 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> 35 4 <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 8 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 6 <span style='color: #BB0000;'>NA</span> <span style='color: #BB0000;'>NA</span> 2 <span style='color: #BB0000;'>NA</span> 41 5 <span style='color: #BB0000;'>NA</span> 1 <span style='color: #BB0000;'>NA</span> 1 13 <span style='color: #949494;'># ℹ 359 more rows</span> <span style='color: #949494;'># ℹ 94 more variables: </span><span style='color: #949494; font-weight: bold;'>BOS</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>BQN</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>BTV</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>BUF</span><span style='color: #949494;'> <dbl>, </span><span style='color: #949494; font-weight: bold;'>BUR</span><span style='color: #949494;'> <dbl>, …</span> </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting .pull-left[ ``` r system.time( as_tibble(dbGetQuery( con_memory, pivot_sql)) ) ``` ``` user system elapsed 0.083 0.003 0.041 ``` ] .pull-right[ ``` r system.time( nycflights13::flights |> count(year, month, day, dest) |> tidyr::pivot_wider( names_from = dest, values_from = n) ) ``` ``` user system elapsed 0.149 0.001 0.150 ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting ``` r write_pivot_sql <- paste0( "COPY (", pivot_sql, ") TO 'pivot.parquet' (FORMAT PARQUET)" ) dbExecute(con_memory, write_pivot_sql) ``` ``` [1] 365 ``` ``` r q_unpivot_dyn <- "(SELECT * FROM (UNPIVOT 'pivot.parquet' ON COLUMNS(* EXCLUDE (year, month, day)) INTO NAME dest VALUE n))" SQL(q_unpivot_dyn) ``` ``` <SQL> (SELECT * FROM (UNPIVOT 'pivot.parquet' ON COLUMNS(* EXCLUDE (year, month, day)) INTO NAME dest VALUE n)) ``` <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- # Application: DBI + dbplyr and pivoting ``` r tbl(con_memory, from = q_unpivot_dyn) ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: SQL [?? x 5]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dest</span> <span style='font-weight: bold;'>n</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><chr></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 ALB 3 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 1 ATL 40 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 1 AUS 6 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 1 AVL 1 <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 1 BDL 2 <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 1 BNA 10 <span style='color: #949494;'># ℹ more rows</span> </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/21-frame.webp") background-size: contain background-position: 100% 100% .pull-left[ # DuckDB showcase: Breakout Discuss in groups of 4-6 people: - How often is the data you work with updated? Realtime, daily, monthly, never? - Does the size of the data you are working with justify the overhead of working with local database dumps? - Could the proposed workflow be implemented in your current projects? What constraints would you face? ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_22.R"><title>Script: db_22.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_22.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_22.R</text></g></svg></div> <div class="exercise-timer"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="84" height="20" role="img" aria-label="Timer: 10:00"><title>Timer: 10:00</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="84" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="41" height="20" fill="#555"/><rect x="41" width="43" height="20" fill="#007ec6"/><rect width="84" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="215" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="310">Timer</text><text x="215" y="140" transform="scale(.1)" fill="#fff" textLength="310">Timer</text><text aria-hidden="true" x="615" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">10:00</text><text x="615" y="140" transform="scale(.1)" fill="#fff" textLength="330">10:00</text></g></svg></div> --- class: middle <img src="data:image/png;base64,#images/triangle.svg" width="70%" height="70%" style="display: block; margin: auto;" /> --- background-image: url("data:image/png;base64,#images/22.webp") background-size: 40% background-position: 100% 100% # Full circle .pull-left[ - Dump as a single file - Dump as a partitioned dataset <div class="script-number-font"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> ```r library(DBI) library(dplyr) requireNamespace("arrow") ``` ] --- background-image: url("data:image/png;base64,#images/22-frame.webp") background-size: 40% background-position: 100% 100% # Full circle .pull-left[ - Dump as a single file - Dump as a partitioned dataset <div class="script-number-font"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> ```r library(DBI) library(dplyr) requireNamespace("arrow") ``` ] --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 450px # Prepare database .pull-left[ Imagine someone has set up a database for you. ``` r con_rw <- dbConnect( duckdb::duckdb(), dbdir = "flights.duckdb" ) copy_to( con_rw, nycflights13::flights, name = "flights", temporary = FALSE ) dbDisconnect(con_rw) ``` ] .pull-right[ ``` r con <- dbConnect( duckdb::duckdb(), dbdir = "flights.duckdb", read_only = TRUE ) flights_duckdb <- tbl(con, "flights") flights_duckdb ``` <PRE class="fansi fansi-output"><CODE><span style='color: #949494;'># Source: table<flights> [?? x 19]</span> <span style='color: #949494;'># Database: DuckDB v1.0.0</span> <span style='font-weight: bold;'>year</span> <span style='font-weight: bold;'>month</span> <span style='font-weight: bold;'>day</span> <span style='font-weight: bold;'>dep_time</span> <span style='font-weight: bold;'>sched_d…¹</span> <span style='font-weight: bold;'>dep_d…²</span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><int></span> <span style='color: #949494; font-style: italic;'><dbl></span> <span style='color: #BCBCBC;'>1</span> <span style='text-decoration: underline;'>2</span>013 1 1 517 515 2 <span style='color: #BCBCBC;'>2</span> <span style='text-decoration: underline;'>2</span>013 1 1 533 529 4 <span style='color: #BCBCBC;'>3</span> <span style='text-decoration: underline;'>2</span>013 1 1 542 540 2 <span style='color: #BCBCBC;'>4</span> <span style='text-decoration: underline;'>2</span>013 1 1 544 545 -<span style='color: #BB0000;'>1</span> <span style='color: #BCBCBC;'>5</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 600 -<span style='color: #BB0000;'>6</span> <span style='color: #BCBCBC;'>6</span> <span style='text-decoration: underline;'>2</span>013 1 1 554 558 -<span style='color: #BB0000;'>4</span> <span style='color: #949494;'># ℹ more rows</span> <span style='color: #949494;'># ℹ abbreviated names: ¹sched_dep_time, …</span> </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- # Method 1: via local data frame .pull-left[ ``` r flights_duckdb |> filter(month == 1) |> collect() |> duckplyr::df_to_parquet( "flights-jan.parquet") ``` ] .pull-right[ ``` r flights_duckdb |> collect() |> duckplyr::df_to_parquet("flights.parquet") ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- # Method 2: via DBI .pull-left[ ## Setup ``` r sql_jan <- flights_duckdb |> filter(month == 1) |> dbplyr::sql_render() sql_jan ``` ``` <SQL> SELECT flights.* FROM flights WHERE ("month" = 1.0) ``` ``` r fs::dir_create("flights-arrow") ``` ] .pull-right[ ``` r res <- dbSendQuery(con, sql_jan) i <- 0 repeat { df <- dbFetch(res, n = 10000) if (nrow(df) == 0) break path <- fs::path( "flights-arrow", sprintf("part-%05d.parquet", i)) duckplyr::df_to_parquet(df, path) i <- i + 1 } dbClearResult(res) ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- # Method 3: via parquetize ``` r parquetize::dbi_to_parquet( con, sql_jan, "flights-parquetized", max_rows = 10000 ) fs::dir_tree("flights-parquetized/") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #00BBBB; font-weight: bold;'>flights-parquetized/</span> ├── part-1-10000.parquet ├── part-10001-20000.parquet └── part-20001-27004.parquet </CODE></PRE> <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/magic-hat-and-wand-nath-r.svg") background-size: 25% background-position: 100% 100% # Method 4: via DBI and arrow .pull-left[ ``` r con_adbi <- dbConnect( adbi::adbi(duckdb::duckdb_adbc()), path = "flights.duckdb" ) sql <- "SELECT * FROM flights" ``` ] .pull-right[ ``` r system.time({ res <- dbSendQueryArrow(con_adbi, sql) stream <- dbFetchArrow(res) arrow::write_dataset( arrow::as_record_batch_reader(stream), "flights-adbi/", max_rows_per_file = 10000 ) dbClearResult(res) }) ``` ``` user system elapsed 0.372 0.159 0.175 ``` ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- # Adding partitions to a dataset .pull-left[ ``` r write_month <- function(month) { sql <- flights_duckdb |> filter(month == !!month) |> dbplyr::sql_render() dir <- fs::path( "flights-part-manual", sprintf("month=%d", month)) fs::dir_create(dir) df <- dbGetQuery(con, sql) duckplyr::df_to_parquet( df, fs::path(dir, "part-0.parquet")) } ``` ] .pull-right[ ``` r write_month(1) write_month(2) write_month(3) fs::dir_tree("flights-part-manual") ``` <PRE class="fansi fansi-output"><CODE><span style='color: #00BBBB; font-weight: bold;'>flights-part-manual</span> ├── <span style='color: #00BBBB; font-weight: bold;'>month=1</span> │ └── part-0.parquet ├── <span style='color: #00BBBB; font-weight: bold;'>month=2</span> │ └── part-0.parquet └── <span style='color: #00BBBB; font-weight: bold;'>month=3</span> └── part-0.parquet </CODE></PRE> ] <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> --- background-image: url("data:image/png;base64,#images/22-frame.webp") background-size: 40% background-position: 100% 100% # Full circle: Exercises .pull-left[ Write code to create a partitioned dataset with the `flights` table, partitioned by `origin`. - Hint: The dataset only contains flights departing from New York City airports. <div class="script-number"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="Script: db_23.R"><title>Script: db_23.R</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="43" height="20" fill="#555"/><rect x="43" width="57" height="20" fill="#fe7d37"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="225" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">Script</text><text x="225" y="140" transform="scale(.1)" fill="#fff" textLength="330">Script</text><text aria-hidden="true" x="705" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">db_23.R</text><text x="705" y="140" transform="scale(.1)" fill="#fff" textLength="470">db_23.R</text></g></svg></div> <div class="exercise-timer"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="84" height="20" role="img" aria-label="Timer: 10:00"><title>Timer: 10:00</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="84" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="41" height="20" fill="#555"/><rect x="41" width="43" height="20" fill="#007ec6"/><rect width="84" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="215" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="310">Timer</text><text x="215" y="140" transform="scale(.1)" fill="#fff" textLength="310">Timer</text><text aria-hidden="true" x="615" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="330">10:00</text><text x="615" y="140" transform="scale(.1)" fill="#fff" textLength="330">10:00</text></g></svg></div> ] --- class: inverse, middle, center # https://posit-conf-2024.github.io/databases/part-3.html